In [751]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt
%matplotlib inline


#html export
import plotly.io as pio
pio.renderers.default = 'notebook'
In [752]:
# load the raw car dataset (relative path - assumes cardata.csv sits next to the notebook)
df = pd.read_csv('cardata.csv')
df
Out[752]:
Make Model Year Engine Fuel Type Engine HP Engine Cylinders Transmission Type Driven_Wheels Number of Doors Market Category Vehicle Size Vehicle Style highway MPG city mpg Popularity MSRP
0 BMW 1 Series M 2011 premium unleaded (required) 335.0 6.0 MANUAL rear wheel drive 2.0 Factory Tuner,Luxury,High-Performance Compact Coupe 26 19 3916 46135
1 BMW 1 Series 2011 premium unleaded (required) 300.0 6.0 MANUAL rear wheel drive 2.0 Luxury,Performance Compact Convertible 28 19 3916 40650
2 BMW 1 Series 2011 premium unleaded (required) 300.0 6.0 MANUAL rear wheel drive 2.0 Luxury,High-Performance Compact Coupe 28 20 3916 36350
3 BMW 1 Series 2011 premium unleaded (required) 230.0 6.0 MANUAL rear wheel drive 2.0 Luxury,Performance Compact Coupe 28 18 3916 29450
4 BMW 1 Series 2011 premium unleaded (required) 230.0 6.0 MANUAL rear wheel drive 2.0 Luxury Compact Convertible 28 18 3916 34500
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
11909 Acura ZDX 2012 premium unleaded (required) 300.0 6.0 AUTOMATIC all wheel drive 4.0 Crossover,Hatchback,Luxury Midsize 4dr Hatchback 23 16 204 46120
11910 Acura ZDX 2012 premium unleaded (required) 300.0 6.0 AUTOMATIC all wheel drive 4.0 Crossover,Hatchback,Luxury Midsize 4dr Hatchback 23 16 204 56670
11911 Acura ZDX 2012 premium unleaded (required) 300.0 6.0 AUTOMATIC all wheel drive 4.0 Crossover,Hatchback,Luxury Midsize 4dr Hatchback 23 16 204 50620
11912 Acura ZDX 2013 premium unleaded (recommended) 300.0 6.0 AUTOMATIC all wheel drive 4.0 Crossover,Hatchback,Luxury Midsize 4dr Hatchback 23 16 204 50920
11913 Lincoln Zephyr 2006 regular unleaded 221.0 6.0 AUTOMATIC front wheel drive 4.0 Luxury Midsize Sedan 26 17 61 28995

11914 rows × 16 columns

In [753]:
df.columns
Out[753]:
Index(['Make', 'Model', 'Year', 'Engine Fuel Type', 'Engine HP',
       'Engine Cylinders', 'Transmission Type', 'Driven_Wheels',
       'Number of Doors', 'Market Category', 'Vehicle Size', 'Vehicle Style',
       'highway MPG', 'city mpg', 'Popularity', 'MSRP'],
      dtype='object')
In [754]:
# make column titles consistent: lowercase, with underscores instead of spaces
df.columns = [name.lower().replace(' ', '_') for name in df.columns]
df.head()
Out[754]:
make model year engine_fuel_type engine_hp engine_cylinders transmission_type driven_wheels number_of_doors market_category vehicle_size vehicle_style highway_mpg city_mpg popularity msrp
0 BMW 1 Series M 2011 premium unleaded (required) 335.0 6.0 MANUAL rear wheel drive 2.0 Factory Tuner,Luxury,High-Performance Compact Coupe 26 19 3916 46135
1 BMW 1 Series 2011 premium unleaded (required) 300.0 6.0 MANUAL rear wheel drive 2.0 Luxury,Performance Compact Convertible 28 19 3916 40650
2 BMW 1 Series 2011 premium unleaded (required) 300.0 6.0 MANUAL rear wheel drive 2.0 Luxury,High-Performance Compact Coupe 28 20 3916 36350
3 BMW 1 Series 2011 premium unleaded (required) 230.0 6.0 MANUAL rear wheel drive 2.0 Luxury,Performance Compact Coupe 28 18 3916 29450
4 BMW 1 Series 2011 premium unleaded (required) 230.0 6.0 MANUAL rear wheel drive 2.0 Luxury Compact Convertible 28 18 3916 34500
In [755]:
# collect the names of the string-typed (object dtype) columns
strings = df.select_dtypes(include='object').columns.tolist()
strings
Out[755]:
['make',
 'model',
 'engine_fuel_type',
 'transmission_type',
 'driven_wheels',
 'market_category',
 'vehicle_size',
 'vehicle_style']
In [756]:
# ensure all string data follows the same format: lowercase, underscores for spaces
df[strings] = df[strings].apply(lambda column: column.str.lower().str.replace(' ', '_'))
In [757]:
df.head()
Out[757]:
make model year engine_fuel_type engine_hp engine_cylinders transmission_type driven_wheels number_of_doors market_category vehicle_size vehicle_style highway_mpg city_mpg popularity msrp
0 bmw 1_series_m 2011 premium_unleaded_(required) 335.0 6.0 manual rear_wheel_drive 2.0 factory_tuner,luxury,high-performance compact coupe 26 19 3916 46135
1 bmw 1_series 2011 premium_unleaded_(required) 300.0 6.0 manual rear_wheel_drive 2.0 luxury,performance compact convertible 28 19 3916 40650
2 bmw 1_series 2011 premium_unleaded_(required) 300.0 6.0 manual rear_wheel_drive 2.0 luxury,high-performance compact coupe 28 20 3916 36350
3 bmw 1_series 2011 premium_unleaded_(required) 230.0 6.0 manual rear_wheel_drive 2.0 luxury,performance compact coupe 28 18 3916 29450
4 bmw 1_series 2011 premium_unleaded_(required) 230.0 6.0 manual rear_wheel_drive 2.0 luxury compact convertible 28 18 3916 34500

Exploratory data analysis¶

In [758]:
# quick cardinality scan: for every column, show a sample of its
# unique values and the total number of distinct values
for col in df.columns:
    print(col)
    print(df[col].unique()[:5]) # first 5 unique values
    print(df[col].nunique())
    print()
    
make
['bmw' 'audi' 'fiat' 'mercedes-benz' 'chrysler']
48

model
['1_series_m' '1_series' '100' '124_spider' '190-class']
914

year
[2011 2012 2013 1992 1993]
28

engine_fuel_type
['premium_unleaded_(required)' 'regular_unleaded'
 'premium_unleaded_(recommended)' 'flex-fuel_(unleaded/e85)' 'diesel']
10

engine_hp
[335. 300. 230. 320. 172.]
356

engine_cylinders
[ 6.  4.  5.  8. 12.]
9

transmission_type
['manual' 'automatic' 'automated_manual' 'direct_drive' 'unknown']
5

driven_wheels
['rear_wheel_drive' 'front_wheel_drive' 'all_wheel_drive'
 'four_wheel_drive']
4

number_of_doors
[ 2.  4.  3. nan]
3

market_category
['factory_tuner,luxury,high-performance' 'luxury,performance'
 'luxury,high-performance' 'luxury' 'performance']
71

vehicle_size
['compact' 'midsize' 'large']
3

vehicle_style
['coupe' 'convertible' 'sedan' 'wagon' '4dr_hatchback']
16

highway_mpg
[26 28 27 25 24]
59

city_mpg
[19 20 18 17 16]
69

popularity
[3916 3105  819  617 1013]
48

msrp
[46135 40650 36350 29450 34500]
6049

Distribution of price¶
In [759]:
sns.histplot(df['msrp'], bins = 50) # bins == no. of bars
Out[759]:
<Axes: xlabel='msrp', ylabel='Count'>
No description has been provided for this image
In [760]:
px.histogram(df, x=df['msrp'])
In [761]:
# less expensive cars: zoom in below $100k to see the bulk of the distribution
sns.histplot(df.msrp[df['msrp'] < 100000], bins = 50)
Out[761]:
<Axes: xlabel='msrp', ylabel='Count'>
No description has been provided for this image
In [762]:
# less expensive cars: filter the frame first so the data frame and the x
# column stay aligned (the original passed the full df with a shorter Series)
px.histogram(df[df['msrp'] < 100000], x='msrp')

∴ The prices have a long-tailed distribution: most cars are relatively cheap, with a few very expensive ones.
This kind of distribution is problematic for ML, since the long tail can confuse the model.
Hence we compress the tail by applying a logarithmic transformation to the price.

In [763]:
# example of how a logarithmic distribution behaves:
# log1p(x) = log(1 + x), so large values are compressed far more than small ones
np.log1p([1, 10, 1000, 100000])
Out[763]:
array([ 0.69314718,  2.39789527,  6.90875478, 11.51293546])
In [764]:
price_logs = np.log1p(df['msrp'])
In [765]:
sns.histplot(price_logs)
Out[765]:
<Axes: xlabel='msrp', ylabel='Count'>
No description has been provided for this image

When you use np.log1p (the natural logarithm of 1+𝑥), it compresses larger values more than smaller ones. This transformation reduces the gap between extremely high and moderate prices, effectively pulling the long tail in so the result is close to a normal distribution.

Missing values¶
In [766]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   make               11914 non-null  object 
 1   model              11914 non-null  object 
 2   year               11914 non-null  int64  
 3   engine_fuel_type   11911 non-null  object 
 4   engine_hp          11845 non-null  float64
 5   engine_cylinders   11884 non-null  float64
 6   transmission_type  11914 non-null  object 
 7   driven_wheels      11914 non-null  object 
 8   number_of_doors    11908 non-null  float64
 9   market_category    8172 non-null   object 
 10  vehicle_size       11914 non-null  object 
 11  vehicle_style      11914 non-null  object 
 12  highway_mpg        11914 non-null  int64  
 13  city_mpg           11914 non-null  int64  
 14  popularity         11914 non-null  int64  
 15  msrp               11914 non-null  int64  
dtypes: float64(3), int64(5), object(8)
memory usage: 1.5+ MB
In [767]:
df.isna().sum()  
Out[767]:
make                    0
model                   0
year                    0
engine_fuel_type        3
engine_hp              69
engine_cylinders       30
transmission_type       0
driven_wheels           0
number_of_doors         6
market_category      3742
vehicle_size            0
vehicle_style           0
highway_mpg             0
city_mpg                0
popularity              0
msrp                    0
dtype: int64

Setting up a validation framework¶

In [768]:
# entire dataset
# total number of rows - used below to size the train/val/test splits
n = len(df)
n
Out[768]:
11914
In [769]:
# dividing dataset into 20% Validate, 20% Test, 60% Train,
# NOTE: int() truncates, so the three parts may not sum to n (fixed in the next cell)
n_val = int(len(df) * 0.2)
print(n_val)
print()
n_test = int(len(df) * 0.2)
print(n_test)
print()
n_train = int(len(df) * 0.6)
print(n_train)
print()
2382

2382

7148

In [770]:
n , n_val + n_test + n_train
Out[770]:
(11914, 11912)
In [771]:
# to ensure all records are used:
# keep the two 20% splits and give the remainder to train,
# so that n_train + n_val + n_test == n exactly
n_val = int(len(df) * 0.2)
print(n_val)
n_test = int(len(df) * 0.2)
print(n_test)
n_train = n - n_val - n_test
print(n_train)
2382
2382
7150
In [772]:
(n , n_val + n_test + n_train)
Out[772]:
(11914, 11914)
In [773]:
# subdividing the data
# NOTE: this sequential slice is replaced by a shuffled split further below
df_val = df[:n_val]
df_val
Out[773]:
make model year engine_fuel_type engine_hp engine_cylinders transmission_type driven_wheels number_of_doors market_category vehicle_size vehicle_style highway_mpg city_mpg popularity msrp
0 bmw 1_series_m 2011 premium_unleaded_(required) 335.0 6.0 manual rear_wheel_drive 2.0 factory_tuner,luxury,high-performance compact coupe 26 19 3916 46135
1 bmw 1_series 2011 premium_unleaded_(required) 300.0 6.0 manual rear_wheel_drive 2.0 luxury,performance compact convertible 28 19 3916 40650
2 bmw 1_series 2011 premium_unleaded_(required) 300.0 6.0 manual rear_wheel_drive 2.0 luxury,high-performance compact coupe 28 20 3916 36350
3 bmw 1_series 2011 premium_unleaded_(required) 230.0 6.0 manual rear_wheel_drive 2.0 luxury,performance compact coupe 28 18 3916 29450
4 bmw 1_series 2011 premium_unleaded_(required) 230.0 6.0 manual rear_wheel_drive 2.0 luxury compact convertible 28 18 3916 34500
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2377 porsche cayenne 2016 premium_unleaded_(required) 570.0 8.0 automatic all_wheel_drive 4.0 crossover,luxury,high-performance midsize 4dr_suv 21 14 1715 157300
2378 porsche cayenne 2016 diesel 240.0 6.0 automatic all_wheel_drive 4.0 crossover,luxury,diesel midsize 4dr_suv 29 20 1715 62300
2379 porsche cayenne 2017 premium_unleaded_(required) 520.0 8.0 automatic all_wheel_drive 4.0 crossover,luxury,performance midsize 4dr_suv 21 14 1715 116500
2380 porsche cayenne 2017 premium_unleaded_(required) 300.0 6.0 automatic all_wheel_drive 4.0 crossover,luxury midsize 4dr_suv 24 18 1715 59600
2381 porsche cayenne 2017 premium_unleaded_(required) 440.0 6.0 automatic all_wheel_drive 4.0 crossover,luxury,high-performance midsize 4dr_suv 23 16 1715 97200

2382 rows × 16 columns

In [774]:
# test split: the next n_test sequential rows after validation
df_test = df[n_val:n_val + n_test]
df_test
Out[774]:
make model year engine_fuel_type engine_hp engine_cylinders transmission_type driven_wheels number_of_doors market_category vehicle_size vehicle_style highway_mpg city_mpg popularity msrp
2382 porsche cayenne 2017 premium_unleaded_(required) 570.0 8.0 automatic all_wheel_drive 4.0 crossover,luxury,high-performance midsize 4dr_suv 21 14 1715 159600
2383 porsche cayenne 2017 premium_unleaded_(required) 420.0 6.0 automatic all_wheel_drive 4.0 crossover,luxury,performance midsize 4dr_suv 24 17 1715 76200
2384 porsche cayman_s 2006 premium_unleaded_(required) 295.0 6.0 manual rear_wheel_drive 2.0 luxury,high-performance compact coupe 26 18 1715 58900
2385 porsche cayman 2014 premium_unleaded_(required) 275.0 6.0 manual rear_wheel_drive 2.0 luxury,high-performance compact coupe 30 20 1715 52600
2386 porsche cayman 2014 premium_unleaded_(required) 325.0 6.0 manual rear_wheel_drive 2.0 luxury,high-performance compact coupe 28 20 1715 63800
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
4759 ford flex 2016 premium_unleaded_(recommended) 365.0 6.0 automatic all_wheel_drive 4.0 crossover large wagon 21 15 5657 42600
4760 ford flex 2016 regular_unleaded 287.0 6.0 automatic front_wheel_drive 4.0 crossover,performance large wagon 23 16 5657 32300
4761 ford flex 2016 regular_unleaded 287.0 6.0 automatic front_wheel_drive 4.0 crossover large wagon 23 16 5657 29600
4762 ford flex 2016 regular_unleaded 287.0 6.0 automatic all_wheel_drive 4.0 crossover,performance large wagon 22 16 5657 34250
4763 ford flex 2016 regular_unleaded 287.0 6.0 automatic all_wheel_drive 4.0 crossover,performance large wagon 22 16 5657 39750

2382 rows × 16 columns

In [775]:
# train split: everything after the validation and test rows
df_train = df[n_val + n_test:]
df_train
Out[775]:
make model year engine_fuel_type engine_hp engine_cylinders transmission_type driven_wheels number_of_doors market_category vehicle_size vehicle_style highway_mpg city_mpg popularity msrp
4764 ford flex 2016 regular_unleaded 287.0 6.0 automatic front_wheel_drive 4.0 crossover,performance large wagon 23 16 5657 37800
4765 ford flex 2017 premium_unleaded_(recommended) 365.0 6.0 automatic all_wheel_drive 4.0 crossover large wagon 21 15 5657 43030
4766 ford flex 2017 regular_unleaded 287.0 6.0 automatic all_wheel_drive 4.0 crossover,performance large wagon 22 16 5657 40180
4767 ford flex 2017 regular_unleaded 287.0 6.0 automatic front_wheel_drive 4.0 crossover,performance large wagon 23 16 5657 32730
4768 ford flex 2017 regular_unleaded 287.0 6.0 automatic front_wheel_drive 4.0 crossover,performance large wagon 23 16 5657 38230
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
11909 acura zdx 2012 premium_unleaded_(required) 300.0 6.0 automatic all_wheel_drive 4.0 crossover,hatchback,luxury midsize 4dr_hatchback 23 16 204 46120
11910 acura zdx 2012 premium_unleaded_(required) 300.0 6.0 automatic all_wheel_drive 4.0 crossover,hatchback,luxury midsize 4dr_hatchback 23 16 204 56670
11911 acura zdx 2012 premium_unleaded_(required) 300.0 6.0 automatic all_wheel_drive 4.0 crossover,hatchback,luxury midsize 4dr_hatchback 23 16 204 50620
11912 acura zdx 2013 premium_unleaded_(recommended) 300.0 6.0 automatic all_wheel_drive 4.0 crossover,hatchback,luxury midsize 4dr_hatchback 23 16 204 50920
11913 lincoln zephyr 2006 regular_unleaded 221.0 6.0 automatic front_wheel_drive 4.0 luxury midsize sedan 26 17 61 28995

7150 rows × 16 columns

The data is ordered (sequential), and we don't want that ordering when training our model.

In [776]:
# shuffling the records
# an index array 0..n-1; we shuffle this instead of the dataframe itself
np.arange(n)
Out[776]:
array([    0,     1,     2, ..., 11911, 11912, 11913])
In [777]:
# build the index array; shuffling it (rather than df) leaves df intact
idx = np.arange(n)
df.iloc[idx[:10]]
Out[777]:
make model year engine_fuel_type engine_hp engine_cylinders transmission_type driven_wheels number_of_doors market_category vehicle_size vehicle_style highway_mpg city_mpg popularity msrp
0 bmw 1_series_m 2011 premium_unleaded_(required) 335.0 6.0 manual rear_wheel_drive 2.0 factory_tuner,luxury,high-performance compact coupe 26 19 3916 46135
1 bmw 1_series 2011 premium_unleaded_(required) 300.0 6.0 manual rear_wheel_drive 2.0 luxury,performance compact convertible 28 19 3916 40650
2 bmw 1_series 2011 premium_unleaded_(required) 300.0 6.0 manual rear_wheel_drive 2.0 luxury,high-performance compact coupe 28 20 3916 36350
3 bmw 1_series 2011 premium_unleaded_(required) 230.0 6.0 manual rear_wheel_drive 2.0 luxury,performance compact coupe 28 18 3916 29450
4 bmw 1_series 2011 premium_unleaded_(required) 230.0 6.0 manual rear_wheel_drive 2.0 luxury compact convertible 28 18 3916 34500
5 bmw 1_series 2012 premium_unleaded_(required) 230.0 6.0 manual rear_wheel_drive 2.0 luxury,performance compact coupe 28 18 3916 31200
6 bmw 1_series 2012 premium_unleaded_(required) 300.0 6.0 manual rear_wheel_drive 2.0 luxury,performance compact convertible 26 17 3916 44100
7 bmw 1_series 2012 premium_unleaded_(required) 300.0 6.0 manual rear_wheel_drive 2.0 luxury,high-performance compact coupe 28 20 3916 39300
8 bmw 1_series 2012 premium_unleaded_(required) 230.0 6.0 manual rear_wheel_drive 2.0 luxury compact convertible 28 18 3916 36900
9 bmw 1_series 2013 premium_unleaded_(required) 230.0 6.0 manual rear_wheel_drive 2.0 luxury compact convertible 27 18 3916 37200
In [778]:
np.random.seed(2) # to get the same random numbers every time you run your code
np.random.shuffle(idx) # in-place shuffle of the index array
print(idx)
[2735 6720 5878 ... 6637 2575 7336]
In [779]:
# confirming whether the dataset is shuffled by id
df.iloc[idx[:10]] # first 10 records of shuffled dataset
Out[779]:
make model year engine_fuel_type engine_hp engine_cylinders transmission_type driven_wheels number_of_doors market_category vehicle_size vehicle_style highway_mpg city_mpg popularity msrp
2735 chevrolet cobalt 2008 regular_unleaded 148.0 4.0 manual front_wheel_drive 2.0 NaN compact coupe 33 24 1385 14410
6720 toyota matrix 2012 regular_unleaded 132.0 4.0 automatic front_wheel_drive 4.0 hatchback compact 4dr_hatchback 32 25 2031 19685
5878 subaru impreza 2016 regular_unleaded 148.0 4.0 automatic all_wheel_drive 4.0 hatchback compact 4dr_hatchback 37 28 640 19795
11190 volkswagen vanagon 1991 regular_unleaded 90.0 4.0 manual rear_wheel_drive 3.0 NaN large passenger_minivan 18 16 873 2000
4554 ford f-150 2017 flex-fuel_(unleaded/e85) 385.0 8.0 automatic four_wheel_drive 4.0 flex_fuel large crew_cab_pickup 21 15 5657 56260
8001 volkswagen rabbit 2008 regular_unleaded 170.0 5.0 manual front_wheel_drive 4.0 hatchback compact 4dr_hatchback 29 22 873 17575
2882 bentley continental_gtc 2013 premium_unleaded_(required) 500.0 8.0 automatic all_wheel_drive 2.0 exotic,luxury,high-performance midsize convertible 24 14 520 191400
649 bmw 6_series 2015 premium_unleaded_(required) 315.0 6.0 automatic rear_wheel_drive 2.0 luxury,performance midsize coupe 32 21 3916 76100
616 maybach 57 2012 premium_unleaded_(required) 543.0 12.0 automatic rear_wheel_drive 4.0 exotic,luxury large sedan 16 10 67 379050
4459 ford f-150_heritage 2004 regular_unleaded 202.0 6.0 manual four_wheel_drive 2.0 NaN large regular_cab_pickup 18 13 5657 26030
In [780]:
# indexes of the rows after the first n_train
# (i.e. the last 40% of the shuffled data: validation + test)
idx[n_train:]
Out[780]:
array([2779, 3708, 4794, ..., 6637, 2575, 7336])
In [781]:
# shuffled split: take the first n_train shuffled indices for training
df_train = df.iloc[idx[:n_train]] # first 60 % to be for training
df_train
Out[781]:
make model year engine_fuel_type engine_hp engine_cylinders transmission_type driven_wheels number_of_doors market_category vehicle_size vehicle_style highway_mpg city_mpg popularity msrp
2735 chevrolet cobalt 2008 regular_unleaded 148.0 4.0 manual front_wheel_drive 2.0 NaN compact coupe 33 24 1385 14410
6720 toyota matrix 2012 regular_unleaded 132.0 4.0 automatic front_wheel_drive 4.0 hatchback compact 4dr_hatchback 32 25 2031 19685
5878 subaru impreza 2016 regular_unleaded 148.0 4.0 automatic all_wheel_drive 4.0 hatchback compact 4dr_hatchback 37 28 640 19795
11190 volkswagen vanagon 1991 regular_unleaded 90.0 4.0 manual rear_wheel_drive 3.0 NaN large passenger_minivan 18 16 873 2000
4554 ford f-150 2017 flex-fuel_(unleaded/e85) 385.0 8.0 automatic four_wheel_drive 4.0 flex_fuel large crew_cab_pickup 21 15 5657 56260
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
434 bmw 4_series 2015 premium_unleaded_(required) 300.0 6.0 automatic rear_wheel_drive 2.0 luxury,performance midsize convertible 31 20 3916 54900
1902 volkswagen beetle 2015 premium_unleaded_(recommended) 210.0 4.0 automated_manual front_wheel_drive 2.0 hatchback,performance compact 2dr_hatchback 30 24 873 29215
9334 gmc sierra_1500 2015 flex-fuel_(unleaded/e85) 285.0 6.0 automatic four_wheel_drive 4.0 flex_fuel large extended_cab_pickup 22 17 549 34675
5284 rolls-royce ghost 2014 premium_unleaded_(required) 563.0 12.0 automatic rear_wheel_drive 4.0 exotic,luxury,performance large sedan 21 13 86 303300
2420 volkswagen cc 2017 premium_unleaded_(recommended) 200.0 4.0 automated_manual front_wheel_drive 4.0 performance midsize sedan 31 22 873 37820

7150 rows × 16 columns

In [782]:
# remaining shuffled indices: next n_val rows for validation, rest for test
df_val = df.iloc[idx[n_train:n_train + n_val]]
df_test = df.iloc[idx[n_train + n_val:]]
df_val
Out[782]:
make model year engine_fuel_type engine_hp engine_cylinders transmission_type driven_wheels number_of_doors market_category vehicle_size vehicle_style highway_mpg city_mpg popularity msrp
2779 chevrolet colorado 2015 regular_unleaded 200.0 4.0 automatic four_wheel_drive 4.0 NaN compact extended_cab_pickup 25 19 1385 26885
3708 mercedes-benz e-class 2017 premium_unleaded_(required) 241.0 4.0 automatic all_wheel_drive 4.0 luxury midsize sedan 29 22 617 54650
4794 ford focus 2017 flex-fuel_(unleaded/e85) 160.0 4.0 manual front_wheel_drive 4.0 flex_fuel compact sedan 36 26 5657 16775
10498 acura tlx 2016 premium_unleaded_(recommended) 290.0 6.0 automatic front_wheel_drive 4.0 luxury midsize sedan 34 21 204 42600
1880 volkswagen beetle_convertible 2016 regular_unleaded 170.0 4.0 automatic front_wheel_drive 2.0 NaN compact convertible 34 25 873 25995
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
11123 volvo v60 2015 regular_unleaded 240.0 4.0 automatic front_wheel_drive 4.0 luxury midsize wagon 37 25 870 35750
5549 maserati granturismo_convertible 2015 premium_unleaded_(required) 444.0 8.0 automatic rear_wheel_drive 2.0 exotic,luxury,high-performance midsize convertible 20 13 238 145740
4146 cadillac escalade_hybrid 2013 regular_unleaded 332.0 8.0 automatic rear_wheel_drive 4.0 luxury,hybrid large 4dr_suv 23 20 1624 74425
6337 mitsubishi lancer 2016 regular_unleaded 148.0 4.0 manual front_wheel_drive 4.0 NaN compact sedan 34 24 436 17595
9814 kia sorento 2015 regular_unleaded 290.0 6.0 automatic front_wheel_drive 4.0 crossover midsize 4dr_suv 25 18 1720 26700

2382 rows × 16 columns

In [783]:
# resetting the index with new shuffled dataframe
# drop=True discards the old (shuffled) row labels instead of keeping them as a column
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

df_train
Out[783]:
make model year engine_fuel_type engine_hp engine_cylinders transmission_type driven_wheels number_of_doors market_category vehicle_size vehicle_style highway_mpg city_mpg popularity msrp
0 chevrolet cobalt 2008 regular_unleaded 148.0 4.0 manual front_wheel_drive 2.0 NaN compact coupe 33 24 1385 14410
1 toyota matrix 2012 regular_unleaded 132.0 4.0 automatic front_wheel_drive 4.0 hatchback compact 4dr_hatchback 32 25 2031 19685
2 subaru impreza 2016 regular_unleaded 148.0 4.0 automatic all_wheel_drive 4.0 hatchback compact 4dr_hatchback 37 28 640 19795
3 volkswagen vanagon 1991 regular_unleaded 90.0 4.0 manual rear_wheel_drive 3.0 NaN large passenger_minivan 18 16 873 2000
4 ford f-150 2017 flex-fuel_(unleaded/e85) 385.0 8.0 automatic four_wheel_drive 4.0 flex_fuel large crew_cab_pickup 21 15 5657 56260
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7145 bmw 4_series 2015 premium_unleaded_(required) 300.0 6.0 automatic rear_wheel_drive 2.0 luxury,performance midsize convertible 31 20 3916 54900
7146 volkswagen beetle 2015 premium_unleaded_(recommended) 210.0 4.0 automated_manual front_wheel_drive 2.0 hatchback,performance compact 2dr_hatchback 30 24 873 29215
7147 gmc sierra_1500 2015 flex-fuel_(unleaded/e85) 285.0 6.0 automatic four_wheel_drive 4.0 flex_fuel large extended_cab_pickup 22 17 549 34675
7148 rolls-royce ghost 2014 premium_unleaded_(required) 563.0 12.0 automatic rear_wheel_drive 4.0 exotic,luxury,performance large sedan 21 13 86 303300
7149 volkswagen cc 2017 premium_unleaded_(recommended) 200.0 4.0 automated_manual front_wheel_drive 4.0 performance midsize sedan 31 22 873 37820

7150 rows × 16 columns

In [784]:
df_train['msrp'].values
Out[784]:
array([ 14410,  19685,  19795, ...,  34675, 303300,  37820], dtype=int64)
In [785]:
np.log1p(df_train['msrp'].values)
Out[785]:
array([ 9.57574708,  9.887663  ,  9.89323518, ..., 10.45380308,
       12.62248099, 10.54061978])
In [786]:
# log-transformed target vectors, one per split
y_train = np.log1p(df_train['msrp'].values)
y_val = np.log1p(df_val['msrp'].values)
y_test = np.log1p(df_test['msrp'].values)
In [787]:
# delete msrp variable to avoid accidentally using it
# (keeping the target inside the feature frames would leak it into training)
del df_train['msrp']
del df_val['msrp']
del df_test['msrp']
In [788]:
len(y_train)
Out[788]:
7150

Linear Regression¶

In [789]:
df_train.iloc[10]
Out[789]:
make                                 rolls-royce
model                     phantom_drophead_coupe
year                                        2015
engine_fuel_type     premium_unleaded_(required)
engine_hp                                  453.0
engine_cylinders                            12.0
transmission_type                      automatic
driven_wheels                   rear_wheel_drive
number_of_doors                              2.0
market_category        exotic,luxury,performance
vehicle_size                               large
vehicle_style                        convertible
highway_mpg                                   19
city_mpg                                      11
popularity                                    86
Name: 10, dtype: object
In [790]:
# taking engine_hp, city_mpg, popularity as our features
# for a single example row (label 10 of the reset training frame)
xi = df_train.loc[10, ['engine_hp', 'city_mpg', 'popularity']]
xi
Out[790]:
engine_hp     453.0
city_mpg         11
popularity       86
Name: 10, dtype: object
In [791]:
# extract the raw values as a numpy array (object dtype: mixed float/int)
xi = xi.values
xi
Out[791]:
array([453.0, 11, 86], dtype=object)

The linear model:

$$ g(X_i) = W_0 + W_1 X_{i1} + W_2 X_{i2} + W_3 X_{i3} $$

In [792]:
# hand-picked illustrative parameters (not trained - used to demo the model form)
w0 = 7.17           # bias term - prediction we make w/o knowing anything about the car
w = [0.01, 0.04, 0.002]     # weight for each feature
In [793]:
def linear_regression(xi):
    """Toy linear model: bias plus the weighted sum of the features in xi.

    Relies on the module-level parameters w0 (bias) and w (feature weights).
    """
    prediction = w0
    for j, feature in enumerate(xi):
        prediction += w[j] * feature
    return prediction
In [794]:
linear_regression(xi)
Out[794]:
12.312
In [795]:
# undoing the log we applied
# expm1 is the exact inverse of log1p: expm1(log1p(x)) == x
np.expm1(12.312)
Out[795]:
222347.2221101062
In [796]:
np.log1p(222347.2221101062)
Out[796]:
12.312

Linear Regression Vector form¶

In [797]:
# dot product for features and weights
def dot(xi, w):
    """Return the dot product of feature vector xi and weight vector w.

    The result is always a float; an empty xi gives 0.0.
    """
    total = 0.0
    for j, feature in enumerate(xi):
        total += feature * w[j]
    return total
In [798]:
def linear_regression(xi):
    """Vector-form linear model: bias w0 plus dot(features, weights)."""
    return w0 + dot(xi, w)

performing dot product :

$$ g(X_i) = W_0 + (X_i)^T \cdot W $$

Dot product between the transposed feature vector and the weight vector: $$ (X_i)^T \cdot W = X_{i1}W_1 + X_{i2}W_2 + X_{i3}W_3 + \dots + X_{in}W_n $$

Bringing in W0 into the dot product:

$$ g(X_i) = W_0 \cdot X_{i0}+ (X_i)^T \cdot W $$

Where Xi0 = 1

Hence:   W = [W0 W1 W2 ...Wn ]
         Xi = [Xi0 Xi1 Xi2...Xin]

∴ $$ W^T \cdot X_i = (X_i)^T \cdot W = W_0 + X_{i1}W_1 + X_{i2}W_2 + \dots + X_{in}W_n $$

In [799]:
w_new = [w0] + w
In [800]:
[1] + [1, 2, 3]
Out[800]:
[1, 1, 2, 3]
In [801]:
w_new # acts as our W in our equation
Out[801]:
[7.17, 0.01, 0.04, 0.002]
In [802]:
def linear_regression(xi):
    """Linear model with the bias folded into the combined weight vector w_new."""
    xi = np.insert(xi, 0, 1 ) # adding 1 at index 0 to be Xi0 since xi is an array object
    return dot(xi, w_new)
In [803]:
linear_regression(xi)
Out[803]:
12.312

Linear regression is same even in vector form

In [804]:
# implementing this in a matrix of matrices
# three example rows; the leading 1 in each is the bias feature Xi0
x1 =  [1, 148, 24, 1385]
x2 =  [1, 132, 24, 1385]
x10 = [1, 453, 11, 86]


X = [x1, x2, x10]
X = np.array(X)
X
Out[804]:
array([[   1,  148,   24, 1385],
       [   1,  132,   24, 1385],
       [   1,  453,   11,   86]])
In [805]:
w_new
Out[805]:
[7.17, 0.01, 0.04, 0.002]
In [806]:
def linear_regression(X):
    """Matrix form: predict for every row of X at once (X already carries the bias column)."""
    return X.dot(w_new)
In [807]:
linear_regression(X)
Out[807]:
array([12.38 , 12.22 , 12.312])

Training a linear regression model¶

In our LR we have :

$$ g(X) = (X)^T \cdot W $$

Hence:

$$ X \cdot W = y $$

In our equation, y can only be approximated (i.e. XW $\approx$ y), since an exact solution rarely exists

To get the value closest to y we need to solve the system for the weights W

If X had an inverse we could isolate W directly:

$$ X \cdot W \approx y $$

$$ X^{-1} \cdot X \cdot W \approx X^{-1} \cdot y $$

$$ I \cdot W \approx X^{-1} \cdot y $$

$$ W \approx X^{-1} \cdot y $$

But X is not a square matrix, so it has no inverse.
We therefore first make it square by multiplying both sides on the left by the transpose

∴ $$ X \cdot W \approx y $$

$$ X^T \cdot X \cdot W \approx X^T \cdot y $$

XT X is the Gram matrix - a square matrix built from X.
Multiplying both sides by its inverse isolates W:

$$ (X^T \cdot X)^{-1} \cdot (X^T \cdot X) \cdot W \approx (X^T \cdot X)^{-1} \cdot X^T \cdot y $$

$$ I \cdot W \approx (X^T \cdot X)^{-1} \cdot X^T \cdot y $$

$$ W \approx (X^T \cdot X)^{-1} \cdot X^T \cdot y $$

In [808]:
def train_linear_regression():
    # placeholder - the real implementation (normal equation) follows further below
    pass
In [809]:
# toy feature matrix (9 samples x 3 features: hp, mpg, popularity)
# used to derive the normal-equation solution step by step
X = [
        [148, 24, 1385],
        [132, 25, 2031],
        [453, 11, 86],
        [158, 24, 185],
        [172, 25, 201],
        [413, 11, 86],
        [38, 54, 185],
        [142, 25, 431],
        [453, 31, 86],
]

X = np.array(X)
X
Out[809]:
array([[ 148,   24, 1385],
       [ 132,   25, 2031],
       [ 453,   11,   86],
       [ 158,   24,  185],
       [ 172,   25,  201],
       [ 413,   11,   86],
       [  38,   54,  185],
       [ 142,   25,  431],
       [ 453,   31,   86]])
In [810]:
# Gram matrix X^T X - square, so it can be inverted
XTX = X.T.dot(X)
XTX
Out[810]:
array([[ 696471,   44115,  718540],
       [  44115,    7146,  118803],
       [ 718540,  118803, 6359986]])
In [811]:
# inverse of the Gram matrix (its entries are tiny, hence the rounded zeros below)
XTX_inv = np.linalg.inv(XTX)
XTX_inv.round()
Out[811]:
array([[ 0., -0.,  0.],
       [-0.,  0., -0.],
       [ 0., -0.,  0.]])
In [812]:
XTX.dot(XTX_inv).round() # proof exists
Out[812]:
array([[ 1.,  0.,  0.],
       [-0.,  1.,  0.],
       [ 0.,  0.,  1.]])
In [813]:
y = [100, 200, 150, 250, 100, 200, 150, 250, 120 ]
In [814]:
XTX_inv.dot(X.T).dot(y)
Out[814]:
array([0.26190562, 3.06101252, 0.03696909])
In [815]:
# adding bias term
# current shape: 9 samples x 3 features (no bias column yet)
X.shape
Out[815]:
(9, 3)
In [816]:
ones = np.ones(X.shape[0])
In [817]:
# prepend the ones column: X is now (n, 4) with the bias feature first
X = np.column_stack([ones, X])
X
Out[817]:
array([[1.000e+00, 1.480e+02, 2.400e+01, 1.385e+03],
       [1.000e+00, 1.320e+02, 2.500e+01, 2.031e+03],
       [1.000e+00, 4.530e+02, 1.100e+01, 8.600e+01],
       [1.000e+00, 1.580e+02, 2.400e+01, 1.850e+02],
       [1.000e+00, 1.720e+02, 2.500e+01, 2.010e+02],
       [1.000e+00, 4.130e+02, 1.100e+01, 8.600e+01],
       [1.000e+00, 3.800e+01, 5.400e+01, 1.850e+02],
       [1.000e+00, 1.420e+02, 2.500e+01, 4.310e+02],
       [1.000e+00, 4.530e+02, 3.100e+01, 8.600e+01]])
In [818]:
# Gram matrix of the bias-augmented X (now 4x4)
XTX = X.T.dot(X)
XTX
Out[818]:
array([[9.000000e+00, 2.109000e+03, 2.300000e+02, 4.676000e+03],
       [2.109000e+03, 6.964710e+05, 4.411500e+04, 7.185400e+05],
       [2.300000e+02, 4.411500e+04, 7.146000e+03, 1.188030e+05],
       [4.676000e+03, 7.185400e+05, 1.188030e+05, 6.359986e+06]])
In [819]:
# inverse of the augmented Gram matrix
XTX_inv = np.linalg.inv(XTX)
XTX_inv
Out[819]:
array([[ 3.30686958e+00, -5.39612291e-03, -6.21325581e-02,
        -6.61016816e-04],
       [-5.39612291e-03,  1.11633857e-05,  8.66973393e-05,
         1.08664195e-06],
       [-6.21325581e-02,  8.66973393e-05,  1.46189255e-03,
         8.57849603e-06],
       [-6.61016816e-04,  1.08664195e-06,  8.57849603e-06,
         3.60215866e-07]])
In [820]:
# full weight vector: bias first, then one weight per feature
w_full = XTX_inv.dot(X.T).dot(y)
w_full
Out[820]:
array([ 3.00067767e+02, -2.27742529e-01, -2.57694130e+00, -2.30120640e-02])
In [821]:
# split the solution into the bias term and the feature weights
w0 = w_full[0]
w = w_full[1:]
In [822]:
# coefficients for linear regression
w0, w
Out[822]:
(300.0677669255554, array([-0.22774253, -2.5769413 , -0.02301206]))
In [823]:
def train_linear_regression(X, y):
    """Fit ordinary least squares regression via the normal equation.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Feature matrix WITHOUT a bias column (one is prepended here).
    y : array-like of shape (n_samples,)
        Target values.

    Returns
    -------
    (w0, w) : tuple
        w0 is the bias (intercept) term, w the array of feature weights.
    """
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])  # prepend bias column

    # Solve (X^T X) w = X^T y directly. np.linalg.solve is numerically
    # more stable and cheaper than forming the explicit inverse and
    # multiplying (the previous approach).
    XTX = X.T.dot(X)
    w_full = np.linalg.solve(XTX, X.T.dot(y))

    return w_full[0], w_full[1:]
In [824]:
X = [
        [148, 24, 1385],
        [132, 25, 2031],
        [453, 11, 86],
        [158, 24, 185],
        [172, 25, 201],
        [413, 11, 86],
        [38, 54, 185],
        [142, 25, 431],
        [453, 31, 86],
]

X = np.array(X)
X
Out[824]:
array([[ 148,   24, 1385],
       [ 132,   25, 2031],
       [ 453,   11,   86],
       [ 158,   24,  185],
       [ 172,   25,  201],
       [ 413,   11,   86],
       [  38,   54,  185],
       [ 142,   25,  431],
       [ 453,   31,   86]])
In [825]:
train_linear_regression(X, y)
Out[825]:
(300.0677669255554, array([-0.22774253, -2.5769413 , -0.02301206]))

Car price baseline model¶

In [826]:
df_train.dtypes
Out[826]:
make                  object
model                 object
year                   int64
engine_fuel_type      object
engine_hp            float64
engine_cylinders     float64
transmission_type     object
driven_wheels         object
number_of_doors      float64
market_category       object
vehicle_size          object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
popularity             int64
dtype: object
In [827]:
# using numerical columns
base = ['engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'popularity']
In [828]:
df_train[base]
Out[828]:
engine_hp engine_cylinders highway_mpg city_mpg popularity
0 148.0 4.0 33 24 1385
1 132.0 4.0 32 25 2031
2 148.0 4.0 37 28 640
3 90.0 4.0 18 16 873
4 385.0 8.0 21 15 5657
... ... ... ... ... ...
7145 300.0 6.0 31 20 3916
7146 210.0 4.0 30 24 873
7147 285.0 6.0 22 17 549
7148 563.0 12.0 21 13 86
7149 200.0 4.0 31 22 873

7150 rows × 5 columns

In [829]:
# to extract the values into a numpy array
X_train = df_train[base].values
X_train
Out[829]:
array([[ 148.,    4.,   33.,   24., 1385.],
       [ 132.,    4.,   32.,   25., 2031.],
       [ 148.,    4.,   37.,   28.,  640.],
       ...,
       [ 285.,    6.,   22.,   17.,  549.],
       [ 563.,   12.,   21.,   13.,   86.],
       [ 200.,    4.,   31.,   22.,  873.]])
In [830]:
y_train
Out[830]:
array([ 9.57574708,  9.887663  ,  9.89323518, ..., 10.45380308,
       12.62248099, 10.54061978])
In [831]:
df_train[base].isna().sum()
Out[831]:
engine_hp           40
engine_cylinders    14
highway_mpg          0
city_mpg             0
popularity           0
dtype: int64
In [832]:
# fill the missing engine_hp / engine_cylinders values with 0 so those
# entries contribute nothing through these features (a crude but simple
# baseline; 0 is not a neutral value for a linear model — revisit later)
X_train = df_train[base].fillna(0).values
In [833]:
# getting our weights
w0, w = train_linear_regression(X_train, y_train)
In [834]:
print(len(w))
print(X_train.shape)
5
(7150, 5)
In [835]:
y_pred = w0 + X_train.dot(w)
y_pred
Out[835]:
array([ 9.54792783,  9.38733977,  9.67197758, ..., 10.30423015,
       11.9778914 ,  9.99863111])
In [836]:
sns.histplot(y_pred, color='red', alpha=0.5)   # predictions
sns.histplot(y_train, color='blue', alpha=0.5) # target variables
Out[836]:
<Axes: ylabel='Count'>
No description has been provided for this image

The predictions are systematically lower than the target values, so the model underestimates prices — it is not yet a good fit.

RMSE¶

Quantifying how well the model performs

In [837]:
def rmse(y, y_pred):
    """Root mean squared error between targets `y` and predictions `y_pred`."""
    residuals = y - y_pred
    return np.sqrt((residuals ** 2).mean())
In [838]:
rmse(y_train, y_pred)
Out[838]:
0.7554192603920132

Validating the model¶

In [839]:
df_val[base]
Out[839]:
engine_hp engine_cylinders highway_mpg city_mpg popularity
0 200.0 4.0 25 19 1385
1 241.0 4.0 29 22 617
2 160.0 4.0 36 26 5657
3 290.0 6.0 34 21 204
4 170.0 4.0 34 25 873
... ... ... ... ... ...
2377 240.0 4.0 37 25 870
2378 444.0 8.0 20 13 238
2379 332.0 8.0 23 20 1624
2380 148.0 4.0 34 24 436
2381 290.0 6.0 25 18 1720

2382 rows × 5 columns

In [840]:
# getting our feature matrix X
def prepare_x(df):
    """Return the numeric feature matrix for the columns in `base`,
    with missing values replaced by 0.

    Parameters
    ----------
    df : pandas.DataFrame containing at least the columns listed in `base`.

    Returns
    -------
    ndarray of shape (len(df), len(base)).
    """
    # previous version assigned df[base] twice; the first assignment was
    # dead code — a single fillna step is enough
    df_num = df[base].fillna(0)
    X = df_num.values

    return X
In [ ]:
# Training: build the cleaned feature matrix and fit OLS weights
X_train = prepare_x(df_train)                        # extract clean X matrix
w0, w = train_linear_regression(X_train, y_train)    # bias term + per-feature weights

# Validation: apply the same preparation to held-out data and score
X_val = prepare_x(df_val)
y_pred = w0 + X_val.dot(w)

rmse(y_val, y_pred)                                  # lower is better
Out[ ]:
0.761653099130156

Feature Engineering¶

In [842]:
df_train.year.max()
Out[842]:
2017
In [ ]:
# adding car age as a derived feature
def prepare_x(df):
    """Build the feature matrix from the `base` numeric columns plus a
    derived `age` column, with missing values replaced by 0."""
    df = df.copy()  # never mutate the caller's frame

    # NOTE(review): age is computed relative to this frame's own max year,
    # so train and validation may use different reference years — confirm
    # whether df_train.year.max() should be the fixed reference instead.
    df['age'] = df.year.max() - df.year

    return df[base + ['age']].fillna(0).values
In [844]:
X_train = prepare_x(df_train)
In [845]:
X_train
Out[845]:
array([[1.480e+02, 4.000e+00, 3.300e+01, 2.400e+01, 1.385e+03, 9.000e+00],
       [1.320e+02, 4.000e+00, 3.200e+01, 2.500e+01, 2.031e+03, 5.000e+00],
       [1.480e+02, 4.000e+00, 3.700e+01, 2.800e+01, 6.400e+02, 1.000e+00],
       ...,
       [2.850e+02, 6.000e+00, 2.200e+01, 1.700e+01, 5.490e+02, 2.000e+00],
       [5.630e+02, 1.200e+01, 2.100e+01, 1.300e+01, 8.600e+01, 3.000e+00],
       [2.000e+02, 4.000e+00, 3.100e+01, 2.200e+01, 8.730e+02, 0.000e+00]])
In [846]:
# Training with the extra `age` feature
# X_train = prepare_x(df_train)  # already computed in the previous cell
w0, w = train_linear_regression(X_train, y_train)

# Validation on held-out data with the same preparation
X_val = prepare_x(df_val)
y_pred = w0 + X_val.dot(w)

rmse(y_val, y_pred)  # compare against the baseline RMSE above
Out[846]:
0.5172055461058299

The lower RMSE means the model’s predictions are closer to the actual values, indicating better model performance.

In [847]:
sns.histplot(y_pred, color='red', alpha=0.5, bins=50)   # predictions
sns.histplot(y_val, color='blue', alpha=0.5, bins=50) # target variables
Out[847]:
<Axes: ylabel='Count'>
No description has been provided for this image

Categorical Values¶

In [848]:
df_train
Out[848]:
make model year engine_fuel_type engine_hp engine_cylinders transmission_type driven_wheels number_of_doors market_category vehicle_size vehicle_style highway_mpg city_mpg popularity
0 chevrolet cobalt 2008 regular_unleaded 148.0 4.0 manual front_wheel_drive 2.0 NaN compact coupe 33 24 1385
1 toyota matrix 2012 regular_unleaded 132.0 4.0 automatic front_wheel_drive 4.0 hatchback compact 4dr_hatchback 32 25 2031
2 subaru impreza 2016 regular_unleaded 148.0 4.0 automatic all_wheel_drive 4.0 hatchback compact 4dr_hatchback 37 28 640
3 volkswagen vanagon 1991 regular_unleaded 90.0 4.0 manual rear_wheel_drive 3.0 NaN large passenger_minivan 18 16 873
4 ford f-150 2017 flex-fuel_(unleaded/e85) 385.0 8.0 automatic four_wheel_drive 4.0 flex_fuel large crew_cab_pickup 21 15 5657
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7145 bmw 4_series 2015 premium_unleaded_(required) 300.0 6.0 automatic rear_wheel_drive 2.0 luxury,performance midsize convertible 31 20 3916
7146 volkswagen beetle 2015 premium_unleaded_(recommended) 210.0 4.0 automated_manual front_wheel_drive 2.0 hatchback,performance compact 2dr_hatchback 30 24 873
7147 gmc sierra_1500 2015 flex-fuel_(unleaded/e85) 285.0 6.0 automatic four_wheel_drive 4.0 flex_fuel large extended_cab_pickup 22 17 549
7148 rolls-royce ghost 2014 premium_unleaded_(required) 563.0 12.0 automatic rear_wheel_drive 4.0 exotic,luxury,performance large sedan 21 13 86
7149 volkswagen cc 2017 premium_unleaded_(recommended) 200.0 4.0 automated_manual front_wheel_drive 4.0 performance midsize sedan 31 22 873

7150 rows × 15 columns

In [849]:
df_train.dtypes[df_train.dtypes == 'object']
Out[849]:
make                 object
model                object
engine_fuel_type     object
transmission_type    object
driven_wheels        object
market_category      object
vehicle_size         object
vehicle_style        object
dtype: object
In [850]:
df_train['number_of_doors'].unique()
Out[850]:
array([ 2.,  4.,  3., nan])
In [851]:
# adding number_of_doors (one-hot encoded) to our X feature matrix
def prepare_x(df):
    """Build the feature matrix: `base` numeric columns, car age, and one
    binary indicator column per door count (2, 3, 4)."""
    df = df.copy()  # never mutate the caller's frame
    features = base.copy()

    df['age'] = df.year.max() - df.year
    features.append('age')

    # one 0/1 column per door count
    for doors in [2, 3, 4]:
        col = 'num_doors_%s' % doors
        df[col] = (df['number_of_doors'] == doors).astype('int')
        features.append(col)

    return df[features].fillna(0).values
In [852]:
# Training with age + door-count indicator features
X_train = prepare_x(df_train)
w0, w = train_linear_regression(X_train, y_train)

# Validation on held-out data with the same preparation
X_val = prepare_x(df_val)
y_pred = w0 + X_val.dot(w)

rmse(y_val, y_pred)  # compare against the previous RMSE values
Out[852]:
0.5157995641501689
In [853]:
makes = list(df['make'].value_counts().head().index)
In [854]:
# adding the most common makes (one-hot encoded) to our X feature matrix
def prepare_x(df):
    """Build the feature matrix: `base` numeric columns, car age,
    door-count indicators, and one binary indicator per popular make."""
    df = df.copy()  # never mutate the caller's frame
    features = base.copy()

    df['age'] = df.year.max() - df.year
    features.append('age')

    # one 0/1 column per door count
    for doors in [2, 3, 4]:
        col = 'num_doors_%s' % doors
        df[col] = (df['number_of_doors'] == doors).astype('int')
        features.append(col)

    # one 0/1 column per popular make
    # NOTE(review): `makes` was computed from the full `df`, not df_train —
    # confirm this is intentional (mild information sharing across splits)
    for make in makes:
        col = 'make_%s' % make
        df[col] = (df['make'] == make).astype('int')
        features.append(col)

    return df[features].fillna(0).values
In [855]:
# Training with age + door-count + make indicator features
X_train = prepare_x(df_train)
w0, w = train_linear_regression(X_train, y_train)

# Validation on held-out data with the same preparation
X_val = prepare_x(df_val)
y_pred = w0 + X_val.dot(w)

rmse(y_val, y_pred)  # compare against the previous RMSE values
Out[855]:
0.5076038849556795
In [ ]: